{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Exploring Dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import pandas\n",
"import pandas as pd "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# read a dataset \n",
"df = pd.read_csv(\"../data/framingham.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Head "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" male | \n",
" age | \n",
" education | \n",
" currentSmoker | \n",
" cigsPerDay | \n",
" BPMeds | \n",
" prevalentStroke | \n",
" prevalentHyp | \n",
" diabetes | \n",
" totChol | \n",
" sysBP | \n",
" diaBP | \n",
" BMI | \n",
" heartRate | \n",
" glucose | \n",
" TenYearCHD | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 39 | \n",
" 4.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 195.0 | \n",
" 106.0 | \n",
" 70.0 | \n",
" 26.97 | \n",
" 80.0 | \n",
" 77.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 46 | \n",
" 2.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 250.0 | \n",
" 121.0 | \n",
" 81.0 | \n",
" 28.73 | \n",
" 95.0 | \n",
" 76.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 48 | \n",
" 1.0 | \n",
" 1 | \n",
" 20.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 245.0 | \n",
" 127.5 | \n",
" 80.0 | \n",
" 25.34 | \n",
" 75.0 | \n",
" 70.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 61 | \n",
" 3.0 | \n",
" 1 | \n",
" 30.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 225.0 | \n",
" 150.0 | \n",
" 95.0 | \n",
" 28.58 | \n",
" 65.0 | \n",
" 103.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 46 | \n",
" 3.0 | \n",
" 1 | \n",
" 23.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 285.0 | \n",
" 130.0 | \n",
" 84.0 | \n",
" 23.10 | \n",
" 85.0 | \n",
" 85.0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds prevalentStroke \\\n",
"0 1 39 4.0 0 0.0 0.0 0 \n",
"1 0 46 2.0 0 0.0 0.0 0 \n",
"2 1 48 1.0 1 20.0 0.0 0 \n",
"3 0 61 3.0 1 30.0 0.0 0 \n",
"4 0 46 3.0 1 23.0 0.0 0 \n",
"\n",
" prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose \\\n",
"0 0 0 195.0 106.0 70.0 26.97 80.0 77.0 \n",
"1 0 0 250.0 121.0 81.0 28.73 95.0 76.0 \n",
"2 0 0 245.0 127.5 80.0 25.34 75.0 70.0 \n",
"3 1 0 225.0 150.0 95.0 28.58 65.0 103.0 \n",
"4 0 0 285.0 130.0 84.0 23.10 85.0 85.0 \n",
"\n",
" TenYearCHD \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# head: by default shows first 5 rows \n",
"df.head() "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" male | \n",
" age | \n",
" education | \n",
" currentSmoker | \n",
" cigsPerDay | \n",
" BPMeds | \n",
" prevalentStroke | \n",
" prevalentHyp | \n",
" diabetes | \n",
" totChol | \n",
" sysBP | \n",
" diaBP | \n",
" BMI | \n",
" heartRate | \n",
" glucose | \n",
" TenYearCHD | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 39 | \n",
" 4.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 195.0 | \n",
" 106.0 | \n",
" 70.0 | \n",
" 26.97 | \n",
" 80.0 | \n",
" 77.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 46 | \n",
" 2.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 250.0 | \n",
" 121.0 | \n",
" 81.0 | \n",
" 28.73 | \n",
" 95.0 | \n",
" 76.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 48 | \n",
" 1.0 | \n",
" 1 | \n",
" 20.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 245.0 | \n",
" 127.5 | \n",
" 80.0 | \n",
" 25.34 | \n",
" 75.0 | \n",
" 70.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 61 | \n",
" 3.0 | \n",
" 1 | \n",
" 30.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 225.0 | \n",
" 150.0 | \n",
" 95.0 | \n",
" 28.58 | \n",
" 65.0 | \n",
" 103.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 46 | \n",
" 3.0 | \n",
" 1 | \n",
" 23.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 285.0 | \n",
" 130.0 | \n",
" 84.0 | \n",
" 23.10 | \n",
" 85.0 | \n",
" 85.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 43 | \n",
" 2.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 228.0 | \n",
" 180.0 | \n",
" 110.0 | \n",
" 30.30 | \n",
" 77.0 | \n",
" 99.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 0 | \n",
" 63 | \n",
" 1.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 205.0 | \n",
" 138.0 | \n",
" 71.0 | \n",
" 33.11 | \n",
" 60.0 | \n",
" 85.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 0 | \n",
" 45 | \n",
" 2.0 | \n",
" 1 | \n",
" 20.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 313.0 | \n",
" 100.0 | \n",
" 71.0 | \n",
" 21.68 | \n",
" 79.0 | \n",
" 78.0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds prevalentStroke \\\n",
"0 1 39 4.0 0 0.0 0.0 0 \n",
"1 0 46 2.0 0 0.0 0.0 0 \n",
"2 1 48 1.0 1 20.0 0.0 0 \n",
"3 0 61 3.0 1 30.0 0.0 0 \n",
"4 0 46 3.0 1 23.0 0.0 0 \n",
"5 0 43 2.0 0 0.0 0.0 0 \n",
"6 0 63 1.0 0 0.0 0.0 0 \n",
"7 0 45 2.0 1 20.0 0.0 0 \n",
"\n",
" prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose \\\n",
"0 0 0 195.0 106.0 70.0 26.97 80.0 77.0 \n",
"1 0 0 250.0 121.0 81.0 28.73 95.0 76.0 \n",
"2 0 0 245.0 127.5 80.0 25.34 75.0 70.0 \n",
"3 1 0 225.0 150.0 95.0 28.58 65.0 103.0 \n",
"4 0 0 285.0 130.0 84.0 23.10 85.0 85.0 \n",
"5 1 0 228.0 180.0 110.0 30.30 77.0 99.0 \n",
"6 0 0 205.0 138.0 71.0 33.11 60.0 85.0 \n",
"7 0 0 313.0 100.0 71.0 21.68 79.0 78.0 \n",
"\n",
" TenYearCHD \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 0 \n",
"5 0 \n",
"6 1 \n",
"7 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# head(n): show the first n rows (here n = 8)\n",
"df.head(8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tail\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" male | \n",
" age | \n",
" education | \n",
" currentSmoker | \n",
" cigsPerDay | \n",
" BPMeds | \n",
" prevalentStroke | \n",
" prevalentHyp | \n",
" diabetes | \n",
" totChol | \n",
" sysBP | \n",
" diaBP | \n",
" BMI | \n",
" heartRate | \n",
" glucose | \n",
" TenYearCHD | \n",
"
\n",
" \n",
" \n",
" \n",
" 4235 | \n",
" 0 | \n",
" 48 | \n",
" 2.0 | \n",
" 1 | \n",
" 20.0 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 248.0 | \n",
" 131.0 | \n",
" 72.0 | \n",
" 22.00 | \n",
" 84.0 | \n",
" 86.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4236 | \n",
" 0 | \n",
" 44 | \n",
" 1.0 | \n",
" 1 | \n",
" 15.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 210.0 | \n",
" 126.5 | \n",
" 87.0 | \n",
" 19.16 | \n",
" 86.0 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 4237 | \n",
" 0 | \n",
" 52 | \n",
" 2.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 269.0 | \n",
" 133.5 | \n",
" 83.0 | \n",
" 21.47 | \n",
" 80.0 | \n",
" 107.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4238 | \n",
" 1 | \n",
" 40 | \n",
" 3.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 185.0 | \n",
" 141.0 | \n",
" 98.0 | \n",
" 25.60 | \n",
" 67.0 | \n",
" 72.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4239 | \n",
" 0 | \n",
" 39 | \n",
" 3.0 | \n",
" 1 | \n",
" 30.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 196.0 | \n",
" 133.0 | \n",
" 86.0 | \n",
" 20.91 | \n",
" 85.0 | \n",
" 80.0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds \\\n",
"4235 0 48 2.0 1 20.0 NaN \n",
"4236 0 44 1.0 1 15.0 0.0 \n",
"4237 0 52 2.0 0 0.0 0.0 \n",
"4238 1 40 3.0 0 0.0 0.0 \n",
"4239 0 39 3.0 1 30.0 0.0 \n",
"\n",
" prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI \\\n",
"4235 0 0 0 248.0 131.0 72.0 22.00 \n",
"4236 0 0 0 210.0 126.5 87.0 19.16 \n",
"4237 0 0 0 269.0 133.5 83.0 21.47 \n",
"4238 0 1 0 185.0 141.0 98.0 25.60 \n",
"4239 0 0 0 196.0 133.0 86.0 20.91 \n",
"\n",
" heartRate glucose TenYearCHD \n",
"4235 84.0 86.0 0 \n",
"4236 86.0 NaN 0 \n",
"4237 80.0 107.0 0 \n",
"4238 67.0 72.0 0 \n",
"4239 85.0 80.0 0 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tail: by default shows last 5 rows \n",
"df.tail() "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" male | \n",
" age | \n",
" education | \n",
" currentSmoker | \n",
" cigsPerDay | \n",
" BPMeds | \n",
" prevalentStroke | \n",
" prevalentHyp | \n",
" diabetes | \n",
" totChol | \n",
" sysBP | \n",
" diaBP | \n",
" BMI | \n",
" heartRate | \n",
" glucose | \n",
" TenYearCHD | \n",
"
\n",
" \n",
" \n",
" \n",
" 4232 | \n",
" 1 | \n",
" 68 | \n",
" 1.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 176.0 | \n",
" 168.0 | \n",
" 97.0 | \n",
" 23.14 | \n",
" 60.0 | \n",
" 79.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4233 | \n",
" 1 | \n",
" 50 | \n",
" 1.0 | \n",
" 1 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 313.0 | \n",
" 179.0 | \n",
" 92.0 | \n",
" 25.97 | \n",
" 66.0 | \n",
" 86.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4234 | \n",
" 1 | \n",
" 51 | \n",
" 3.0 | \n",
" 1 | \n",
" 43.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 207.0 | \n",
" 126.5 | \n",
" 80.0 | \n",
" 19.71 | \n",
" 65.0 | \n",
" 68.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4235 | \n",
" 0 | \n",
" 48 | \n",
" 2.0 | \n",
" 1 | \n",
" 20.0 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 248.0 | \n",
" 131.0 | \n",
" 72.0 | \n",
" 22.00 | \n",
" 84.0 | \n",
" 86.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4236 | \n",
" 0 | \n",
" 44 | \n",
" 1.0 | \n",
" 1 | \n",
" 15.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 210.0 | \n",
" 126.5 | \n",
" 87.0 | \n",
" 19.16 | \n",
" 86.0 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 4237 | \n",
" 0 | \n",
" 52 | \n",
" 2.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 269.0 | \n",
" 133.5 | \n",
" 83.0 | \n",
" 21.47 | \n",
" 80.0 | \n",
" 107.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4238 | \n",
" 1 | \n",
" 40 | \n",
" 3.0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 185.0 | \n",
" 141.0 | \n",
" 98.0 | \n",
" 25.60 | \n",
" 67.0 | \n",
" 72.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4239 | \n",
" 0 | \n",
" 39 | \n",
" 3.0 | \n",
" 1 | \n",
" 30.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 196.0 | \n",
" 133.0 | \n",
" 86.0 | \n",
" 20.91 | \n",
" 85.0 | \n",
" 80.0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds \\\n",
"4232 1 68 1.0 0 0.0 0.0 \n",
"4233 1 50 1.0 1 1.0 0.0 \n",
"4234 1 51 3.0 1 43.0 0.0 \n",
"4235 0 48 2.0 1 20.0 NaN \n",
"4236 0 44 1.0 1 15.0 0.0 \n",
"4237 0 52 2.0 0 0.0 0.0 \n",
"4238 1 40 3.0 0 0.0 0.0 \n",
"4239 0 39 3.0 1 30.0 0.0 \n",
"\n",
" prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI \\\n",
"4232 0 1 0 176.0 168.0 97.0 23.14 \n",
"4233 0 1 0 313.0 179.0 92.0 25.97 \n",
"4234 0 0 0 207.0 126.5 80.0 19.71 \n",
"4235 0 0 0 248.0 131.0 72.0 22.00 \n",
"4236 0 0 0 210.0 126.5 87.0 19.16 \n",
"4237 0 0 0 269.0 133.5 83.0 21.47 \n",
"4238 0 1 0 185.0 141.0 98.0 25.60 \n",
"4239 0 0 0 196.0 133.0 86.0 20.91 \n",
"\n",
" heartRate glucose TenYearCHD \n",
"4232 60.0 79.0 1 \n",
"4233 66.0 86.0 1 \n",
"4234 65.0 68.0 0 \n",
"4235 84.0 86.0 0 \n",
"4236 86.0 NaN 0 \n",
"4237 80.0 107.0 0 \n",
"4238 67.0 72.0 0 \n",
"4239 85.0 80.0 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tail(n): show the last n rows (here n = 8)\n",
"df.tail(8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Column Names"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',\n",
" 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',\n",
" 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],\n",
" dtype='object')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns \n",
"df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Observations and Variables (Rows and Columns)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4240, 16)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# shape: an attribute (not a method) holding the (rows, columns) tuple\n",
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Types "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"male int64\n",
"age int64\n",
"education float64\n",
"currentSmoker int64\n",
"cigsPerDay float64\n",
"BPMeds float64\n",
"prevalentStroke int64\n",
"prevalentHyp int64\n",
"diabetes int64\n",
"totChol float64\n",
"sysBP float64\n",
"diaBP float64\n",
"BMI float64\n",
"heartRate float64\n",
"glucose float64\n",
"TenYearCHD int64\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check datatypes \n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Information"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 4240 entries, 0 to 4239\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 male 4240 non-null int64 \n",
" 1 age 4240 non-null int64 \n",
" 2 education 4135 non-null float64\n",
" 3 currentSmoker 4240 non-null int64 \n",
" 4 cigsPerDay 4211 non-null float64\n",
" 5 BPMeds 4187 non-null float64\n",
" 6 prevalentStroke 4240 non-null int64 \n",
" 7 prevalentHyp 4240 non-null int64 \n",
" 8 diabetes 4240 non-null int64 \n",
" 9 totChol 4190 non-null float64\n",
" 10 sysBP 4240 non-null float64\n",
" 11 diaBP 4240 non-null float64\n",
" 12 BMI 4221 non-null float64\n",
" 13 heartRate 4239 non-null float64\n",
" 14 glucose 3852 non-null float64\n",
" 15 TenYearCHD 4240 non-null int64 \n",
"dtypes: float64(9), int64(7)\n",
"memory usage: 530.1 KB\n"
]
}
],
"source": [
"# info: prints an overview of the dataset (index range, column names, non-null counts, dtypes, memory usage)\n",
"df.info() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numerical Summary of a Dataset"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" male | \n",
" age | \n",
" education | \n",
" currentSmoker | \n",
" cigsPerDay | \n",
" BPMeds | \n",
" prevalentStroke | \n",
" prevalentHyp | \n",
" diabetes | \n",
" totChol | \n",
" sysBP | \n",
" diaBP | \n",
" BMI | \n",
" heartRate | \n",
" glucose | \n",
" TenYearCHD | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 4240.000000 | \n",
" 4240.000000 | \n",
" 4135.000000 | \n",
" 4240.000000 | \n",
" 4211.000000 | \n",
" 4187.000000 | \n",
" 4240.000000 | \n",
" 4240.000000 | \n",
" 4240.000000 | \n",
" 4190.000000 | \n",
" 4240.000000 | \n",
" 4240.000000 | \n",
" 4221.000000 | \n",
" 4239.000000 | \n",
" 3852.000000 | \n",
" 4240.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 0.429245 | \n",
" 49.580189 | \n",
" 1.979444 | \n",
" 0.494104 | \n",
" 9.005937 | \n",
" 0.029615 | \n",
" 0.005896 | \n",
" 0.310613 | \n",
" 0.025708 | \n",
" 236.699523 | \n",
" 132.354599 | \n",
" 82.897759 | \n",
" 25.800801 | \n",
" 75.878981 | \n",
" 81.963655 | \n",
" 0.151887 | \n",
"
\n",
" \n",
" std | \n",
" 0.495027 | \n",
" 8.572942 | \n",
" 1.019791 | \n",
" 0.500024 | \n",
" 11.922462 | \n",
" 0.169544 | \n",
" 0.076569 | \n",
" 0.462799 | \n",
" 0.158280 | \n",
" 44.591284 | \n",
" 22.033300 | \n",
" 11.910394 | \n",
" 4.079840 | \n",
" 12.025348 | \n",
" 23.954335 | \n",
" 0.358953 | \n",
"
\n",
" \n",
" min | \n",
" 0.000000 | \n",
" 32.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 107.000000 | \n",
" 83.500000 | \n",
" 48.000000 | \n",
" 15.540000 | \n",
" 44.000000 | \n",
" 40.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 0.000000 | \n",
" 42.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 206.000000 | \n",
" 117.000000 | \n",
" 75.000000 | \n",
" 23.070000 | \n",
" 68.000000 | \n",
" 71.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 0.000000 | \n",
" 49.000000 | \n",
" 2.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 234.000000 | \n",
" 128.000000 | \n",
" 82.000000 | \n",
" 25.400000 | \n",
" 75.000000 | \n",
" 78.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 1.000000 | \n",
" 56.000000 | \n",
" 3.000000 | \n",
" 1.000000 | \n",
" 20.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 263.000000 | \n",
" 144.000000 | \n",
" 90.000000 | \n",
" 28.040000 | \n",
" 83.000000 | \n",
" 87.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" max | \n",
" 1.000000 | \n",
" 70.000000 | \n",
" 4.000000 | \n",
" 1.000000 | \n",
" 70.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 696.000000 | \n",
" 295.000000 | \n",
" 142.500000 | \n",
" 56.800000 | \n",
" 143.000000 | \n",
" 394.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" male age education currentSmoker cigsPerDay \\\n",
"count 4240.000000 4240.000000 4135.000000 4240.000000 4211.000000 \n",
"mean 0.429245 49.580189 1.979444 0.494104 9.005937 \n",
"std 0.495027 8.572942 1.019791 0.500024 11.922462 \n",
"min 0.000000 32.000000 1.000000 0.000000 0.000000 \n",
"25% 0.000000 42.000000 1.000000 0.000000 0.000000 \n",
"50% 0.000000 49.000000 2.000000 0.000000 0.000000 \n",
"75% 1.000000 56.000000 3.000000 1.000000 20.000000 \n",
"max 1.000000 70.000000 4.000000 1.000000 70.000000 \n",
"\n",
" BPMeds prevalentStroke prevalentHyp diabetes totChol \\\n",
"count 4187.000000 4240.000000 4240.000000 4240.000000 4190.000000 \n",
"mean 0.029615 0.005896 0.310613 0.025708 236.699523 \n",
"std 0.169544 0.076569 0.462799 0.158280 44.591284 \n",
"min 0.000000 0.000000 0.000000 0.000000 107.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 206.000000 \n",
"50% 0.000000 0.000000 0.000000 0.000000 234.000000 \n",
"75% 0.000000 0.000000 1.000000 0.000000 263.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 696.000000 \n",
"\n",
" sysBP diaBP BMI heartRate glucose \\\n",
"count 4240.000000 4240.000000 4221.000000 4239.000000 3852.000000 \n",
"mean 132.354599 82.897759 25.800801 75.878981 81.963655 \n",
"std 22.033300 11.910394 4.079840 12.025348 23.954335 \n",
"min 83.500000 48.000000 15.540000 44.000000 40.000000 \n",
"25% 117.000000 75.000000 23.070000 68.000000 71.000000 \n",
"50% 128.000000 82.000000 25.400000 75.000000 78.000000 \n",
"75% 144.000000 90.000000 28.040000 83.000000 87.000000 \n",
"max 295.000000 142.500000 56.800000 143.000000 394.000000 \n",
"\n",
" TenYearCHD \n",
"count 4240.000000 \n",
"mean 0.151887 \n",
"std 0.358953 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 0.000000 \n",
"max 1.000000 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# describe: summary statistics per numeric column (count, mean, std) plus the five-number summary (min, 25%, 50%, 75%, max)\n",
"df.describe() "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" male | \n",
" 4240.0 | \n",
" 0.429245 | \n",
" 0.495027 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 1.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
" age | \n",
" 4240.0 | \n",
" 49.580189 | \n",
" 8.572942 | \n",
" 32.00 | \n",
" 42.00 | \n",
" 49.0 | \n",
" 56.00 | \n",
" 70.0 | \n",
"
\n",
" \n",
" education | \n",
" 4135.0 | \n",
" 1.979444 | \n",
" 1.019791 | \n",
" 1.00 | \n",
" 1.00 | \n",
" 2.0 | \n",
" 3.00 | \n",
" 4.0 | \n",
"
\n",
" \n",
" currentSmoker | \n",
" 4240.0 | \n",
" 0.494104 | \n",
" 0.500024 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 1.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
" cigsPerDay | \n",
" 4211.0 | \n",
" 9.005937 | \n",
" 11.922462 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 20.00 | \n",
" 70.0 | \n",
"
\n",
" \n",
" BPMeds | \n",
" 4187.0 | \n",
" 0.029615 | \n",
" 0.169544 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
" prevalentStroke | \n",
" 4240.0 | \n",
" 0.005896 | \n",
" 0.076569 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
" prevalentHyp | \n",
" 4240.0 | \n",
" 0.310613 | \n",
" 0.462799 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 1.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
" diabetes | \n",
" 4240.0 | \n",
" 0.025708 | \n",
" 0.158280 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
" totChol | \n",
" 4190.0 | \n",
" 236.699523 | \n",
" 44.591284 | \n",
" 107.00 | \n",
" 206.00 | \n",
" 234.0 | \n",
" 263.00 | \n",
" 696.0 | \n",
"
\n",
" \n",
" sysBP | \n",
" 4240.0 | \n",
" 132.354599 | \n",
" 22.033300 | \n",
" 83.50 | \n",
" 117.00 | \n",
" 128.0 | \n",
" 144.00 | \n",
" 295.0 | \n",
"
\n",
" \n",
" diaBP | \n",
" 4240.0 | \n",
" 82.897759 | \n",
" 11.910394 | \n",
" 48.00 | \n",
" 75.00 | \n",
" 82.0 | \n",
" 90.00 | \n",
" 142.5 | \n",
"
\n",
" \n",
" BMI | \n",
" 4221.0 | \n",
" 25.800801 | \n",
" 4.079840 | \n",
" 15.54 | \n",
" 23.07 | \n",
" 25.4 | \n",
" 28.04 | \n",
" 56.8 | \n",
"
\n",
" \n",
" heartRate | \n",
" 4239.0 | \n",
" 75.878981 | \n",
" 12.025348 | \n",
" 44.00 | \n",
" 68.00 | \n",
" 75.0 | \n",
" 83.00 | \n",
" 143.0 | \n",
"
\n",
" \n",
" glucose | \n",
" 3852.0 | \n",
" 81.963655 | \n",
" 23.954335 | \n",
" 40.00 | \n",
" 71.00 | \n",
" 78.0 | \n",
" 87.00 | \n",
" 394.0 | \n",
"
\n",
" \n",
" TenYearCHD | \n",
" 4240.0 | \n",
" 0.151887 | \n",
" 0.358953 | \n",
" 0.00 | \n",
" 0.00 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min 25% 50% 75% \\\n",
"male 4240.0 0.429245 0.495027 0.00 0.00 0.0 1.00 \n",
"age 4240.0 49.580189 8.572942 32.00 42.00 49.0 56.00 \n",
"education 4135.0 1.979444 1.019791 1.00 1.00 2.0 3.00 \n",
"currentSmoker 4240.0 0.494104 0.500024 0.00 0.00 0.0 1.00 \n",
"cigsPerDay 4211.0 9.005937 11.922462 0.00 0.00 0.0 20.00 \n",
"BPMeds 4187.0 0.029615 0.169544 0.00 0.00 0.0 0.00 \n",
"prevalentStroke 4240.0 0.005896 0.076569 0.00 0.00 0.0 0.00 \n",
"prevalentHyp 4240.0 0.310613 0.462799 0.00 0.00 0.0 1.00 \n",
"diabetes 4240.0 0.025708 0.158280 0.00 0.00 0.0 0.00 \n",
"totChol 4190.0 236.699523 44.591284 107.00 206.00 234.0 263.00 \n",
"sysBP 4240.0 132.354599 22.033300 83.50 117.00 128.0 144.00 \n",
"diaBP 4240.0 82.897759 11.910394 48.00 75.00 82.0 90.00 \n",
"BMI 4221.0 25.800801 4.079840 15.54 23.07 25.4 28.04 \n",
"heartRate 4239.0 75.878981 12.025348 44.00 68.00 75.0 83.00 \n",
"glucose 3852.0 81.963655 23.954335 40.00 71.00 78.0 87.00 \n",
"TenYearCHD 4240.0 0.151887 0.358953 0.00 0.00 0.0 0.00 \n",
"\n",
" max \n",
"male 1.0 \n",
"age 70.0 \n",
"education 4.0 \n",
"currentSmoker 1.0 \n",
"cigsPerDay 70.0 \n",
"BPMeds 1.0 \n",
"prevalentStroke 1.0 \n",
"prevalentHyp 1.0 \n",
"diabetes 1.0 \n",
"totChol 696.0 \n",
"sysBP 295.0 \n",
"diaBP 142.5 \n",
"BMI 56.8 \n",
"heartRate 143.0 \n",
"glucose 394.0 \n",
"TenYearCHD 1.0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# transpose the summary table so each variable becomes a row and each statistic a column\n",
"df.describe().T"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 4240.000000\n",
"mean 49.580189\n",
"std 8.572942\n",
"min 32.000000\n",
"25% 42.000000\n",
"50% 49.000000\n",
"75% 56.000000\n",
"max 70.000000\n",
"Name: age, dtype: float64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# for specific column \n",
"df['age'].describe() "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" BMI | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 4240.000000 | \n",
" 4221.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 49.580189 | \n",
" 25.800801 | \n",
"
\n",
" \n",
" std | \n",
" 8.572942 | \n",
" 4.079840 | \n",
"
\n",
" \n",
" min | \n",
" 32.000000 | \n",
" 15.540000 | \n",
"
\n",
" \n",
" 25% | \n",
" 42.000000 | \n",
" 23.070000 | \n",
"
\n",
" \n",
" 50% | \n",
" 49.000000 | \n",
" 25.400000 | \n",
"
\n",
" \n",
" 75% | \n",
" 56.000000 | \n",
" 28.040000 | \n",
"
\n",
" \n",
" max | \n",
" 70.000000 | \n",
" 56.800000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age BMI\n",
"count 4240.000000 4221.000000\n",
"mean 49.580189 25.800801\n",
"std 8.572942 4.079840\n",
"min 32.000000 15.540000\n",
"25% 42.000000 23.070000\n",
"50% 49.000000 25.400000\n",
"75% 56.000000 28.040000\n",
"max 70.000000 56.800000"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# for multiple columns \n",
"df[['age', 'BMI']].describe() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploring Series "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read another dataset into `titanic` (presumably the Kaggle Titanic training set, judging by the link name)\n",
"# fetched over HTTPS through the same short link so the request is not sent in clear text\n",
"titanic = pd.read_csv('https://bit.ly/kaggletrain')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# examine first few rows \n",
"titanic.head() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Value Counts "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# value_counts()\n",
"titanic['Sex'].value_counts() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# value_counts(normalize=True): relative frequencies (proportions that sum to 1; multiply by 100 for percent)\n",
"titanic['Sex'].value_counts(normalize=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# returns a series \n",
"type(titanic['Sex'].value_counts(normalize=True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unique() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# unique() \n",
"titanic['Fare'].unique() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# returns a numpy.ndarray\n",
"type(titanic['Fare'].unique())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cross Tabulation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# crosstab \n",
"pd.crosstab(titanic['Sex'], titanic['Survived'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Describe "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# describe a single numerical column (Age is numeric, so this gives count, mean, std and quartiles)\n",
"titanic['Age'].describe() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mean()\n",
"titanic.Age.mean() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# max()\n",
"titanic.Age.max() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# min() \n",
"titanic.Age.min() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# median() \n",
"titanic.Age.median() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# bar plot: number of rows for each value of the Sex column\n",
"# title and axis labels let the figure stand alone; the trailing ';' hides the Axes repr\n",
"ax = titanic['Sex'].value_counts().plot(kind=\"bar\", title=\"Passengers by sex\")\n",
"ax.set(xlabel=\"Sex\", ylabel=\"Count\");"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# histogram: distribution of the Age column\n",
"# title and x-label let the figure stand alone; the trailing ';' hides the Axes repr\n",
"ax = titanic['Age'].plot(kind=\"hist\", title=\"Distribution of passenger age\")\n",
"ax.set(xlabel=\"Age\");"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}